/*
	Purpose: Attach occ codes to import_cleaning_AVTMH1957.dta
	Note: "Adult child" and "R" refer to the same person.
	Creates: AVTMH1957_IGEanalysis.dta
*/

* Work relative to the AVTMH 1957 source folder; the global Mydirectory1 must
* already be defined by whatever launches this do-file.
cd "$Mydirectory1/1_DataSources/Americans_MentalHealth_1957/"

* Load the cleaned import, replacing anything currently in memory.
use "./output/import_cleaning_AVTMH1957.dta", clear

*** Weight
* The dataset ships without a sampling weight (the codebook's sampling section
* describes it as nationally representative), so every observation gets a weight of one.
generate weight_AVTMH57 = 1

/*------------------------------------------------------------------------------------------

											CLEANING

------------------------------------------------------------------------------------------*/

***********************************************
* Demographics *
***********************************************

*****************
* Gender *
*****************

* Give the raw sex (v293) and age (v299) items readable names in one pass.
rename (v293 v299) (sex age)

*****************
* Age *
*****************

* Squared age term, built from age as it is coded at this point in the script.
generate agesq = age * age

******************
* Marital Status *
******************

* One indicator per marital-status code of v80. Each dummy is 1 when v80 equals
* the paired code and 0 otherwise (a missing v80 therefore yields 0, not missing).
* The two lists are parallel: the i-th name is paired with the i-th code.
local mstat_names married widowed divorced separated never_married
local mstat_codes 1       3       4        5         2

forvalues i = 1/5 {
	local dummy : word `i' of `mstat_names'
	local code  : word `i' of `mstat_codes'
	gen `dummy' = (v80==`code')
	tab `dummy', missing
}

******************
* Race *
******************

rename v294 race
* Codes 8 and 9 are treated as missing.
replace race = . if inlist(race, 8, 9)

* black is 1 for race code 2, 0 for any other code, and missing when race is missing.
gen black = (race==2) if race != .
tab black, missing
label var black "Dummy =1 IF RESPONDENT IS BLACK"

*note: father race is not available

******************
* Foreign Born *
******************

*respondent
* Code 9 of v304 (R's birthplace item) is treated as missing.
replace v304=. if v304==9

* foreignborn is 1 when v304==7, 0 for any other non-missing code, missing otherwise.
gen foreignborn = (v304==7) if v304<.
tab foreignborn, m

* Sample restriction: drop R who are foreign born; keep unknown birthplace.
keep if (foreignborn==0 | foreignborn==.)

*father
* Code 9 of v306 (father's birthplace item) is treated as missing.
replace v306=. if v306==9

* Baseline: father is foreign born when v306 is 4 or 5.
gen fatherforeign = inlist(v306,4,5) if v306<.

* When v306==0 the father's status is inferred from R's own birthplace
* (presumably v306==0 means the item was not asked/inapplicable -- TODO confirm in codebook).
* Note that rows with v304==7 were already dropped by the keep above, so the
* first rule cannot match any remaining observation; it is retained for parity
* with the original coding.
replace fatherforeign = 1 if v306==0 & inlist(race,1,2) & v304==7
* Fixed: this condition previously read `306==0` (a literal, always false)
* instead of `v306==0`, so the rule never fired for race==1.
replace fatherforeign = 0 if v306==0 & inlist(race,1,2) & inrange(v304,1,6)
* NOTE(review): a missing race combined with v306==0 is coded as foreign-born father;
* the rationale is not visible in this file -- confirm this is intended.
replace fatherforeign = 1 if v306==0 & race==.
tab fatherforeign, m

*-------------------------------------------------------------------------------------------------*
*-------------------------------------------------------------------------------------------------*

*************************************************************************************
* State/Region Where Respondent Grew Up or Was Born *
*************************************************************************************

***********************
* State R was born in * 
***********************

*not available 

***********************
* Region R was born in * 
***********************

* Collapse R's birthplace code (v304) into four regions:
*   1 -> 1 northeast, 2 -> 2 midwest, 3 or 4 -> 3 south, 5 -> 4 west.
* Any other code (and missing) is left missing.
gen region4_born = cond(v304==1, 1, ///
                   cond(v304==2, 2, ///
                   cond(inlist(v304,3,4), 3, ///
                   cond(v304==5, 4, .))))
tab region4_born, m
label var region4_born "Region R born"

******************************************
* Whether R was born in the south * 
******************************************

* 1 if born in the south, 0 if born in another known region, missing if region unknown.
gen bornsouth = (region4_born==3) if region4_born != .
tab bornsouth, m

******************************************
* Whether R has moved region since birth * 
******************************************

*no way to tell if R has moved since birth

*****************************************
* State/Region Where Respondent Grew Up *
*****************************************

*not available

******************************************
* Region R currently resides in * 
******************************************

*not available

*-------------------------------------------------------------------------------------------------*
*-------------------------------------------------------------------------------------------------*

*****************
* Employment *
*****************

* Code 9 of v177 is treated as missing.
replace v177 = . if v177 == 9

* employed is 1 for v177 codes 1 through 3, 0 for other non-missing codes.
gen employed = (v177>=1 & v177<=3) if !missing(v177)
tab employed, m

* selfemployed is defined only where v190 is below 91; within that range it is
* 1 for codes 30 through 81 and 0 otherwise.
gen selfemployed = (v190>=30 & v190<=81) if v190<91
tab selfemployed, m
label var selfemployed "R is self-employed"

************************
* Education-Respondent *
************************

* Map the raw schooling item v300 onto an ordered scale:
*   0 no schooling, 1 some grade school, 2 completed grade school, 3 some hs,
*   4 completed hs (v300 4 or 5), 5 some college or other schooling past hs
*   (v300 6 or 7), 6 completed college (v300 8). Other codes stay missing.
gen eduR = .
replace eduR = v300 if inlist(v300, 0, 1, 2, 3)
replace eduR = 4    if inlist(v300, 4, 5)
replace eduR = 5    if inlist(v300, 6, 7)
replace eduR = 6    if v300==8

tab eduR, missing
* NOTE(review): the scale above runs 0-6 (seven values) while the label says "6 categories".
label variable eduR "Education of Respondent (6 categories)"

*dummy for hs ed or greater for respondent
gen hs_ed = inrange(eduR, 4, 6) if !missing(eduR)
tab hs_ed, m
label var hs_ed "HS educated" 

*dummy for college ed or greater for respondent
gen coll_ed = (eduR==6) if !missing(eduR)
tab coll_ed, m
label var coll_ed "Coll educated"

*note: no years of schooling variable available

*binned version of yrsschool
* Years assigned to eduR levels 0..6, in order: none, some grade school,
* grade school, some hs, hs, some college, college or beyond.
gen yrsschool_bin = .
local bin_years 0 6 8 10 12 14 16
forvalues lvl = 0/6 {
	local pos = `lvl' + 1
	local yrs : word `pos' of `bin_years'
	replace yrsschool_bin = `yrs' if eduR==`lvl'
}
label var yrsschool_bin "Years of school, binned"

************************
* Education-Parents *
************************

*not available

*************************
* R's hh size *
*************************

* Run the companion do-file on the data currently in memory. The path is relative
* to the working directory set by the cd near the top of this script.
* Judging by its name it builds R's household-size variable(s); its contents are
* not visible from this file, so check it for any variables it adds or drops.
do ./code/1a_AVTMH1957_R_hhsize.do

******************
* Unions *
******************

* not available

*******************
* Veterans *
*******************

* not available

/*------------------------------------------------------------------------------------------

Crosswalking 1957 AVTMH occupations to 1950 ANES occupations for fathers

------------------------------------------------------------------------------------------*/

* Father's occupation is recorded in v310; the crosswalk file is keyed on a
* variable named occupation, so v310 takes that name before merging.
sort v310
rename v310 occupation

* Bring in the ANES code, discarding crosswalk rows that match no respondent.
merge m:1 occupation using "../Crosswalks/Crosswalk_occ_AVTMH1957_toANES.dta", keep(master match)
* Any respondent without a crosswalk match must have a missing occupation code.
assert occupation==. if _merge==1
drop _merge

*************************************************************************
* Fix occupations for self-employed businessmen, managers, or officials * 
*************************************************************************
/*Info on class of work (i.e. gov't, private, self-employed, etc) not 
  available for father. Can't adjust father occupation.*/

* The crosswalked code goes straight to fatheroccej, the name used for the
* father's occupation in the other datasets.
rename occ1950_ej fatheroccej
label variable fatheroccej "Dad's Occupation (based on ANES categories)"
label variable occupation "Dad's Occupation (AVTMH categories), detailed"

*****************************************************************
*****************************************************************
* Correct dad occ + make mother occ & hh head binary indicators *
*****************************************************************
*****************************************************************

//mom occ is not available

/*Following the coding in other surveys: 
--headofhh_father =1 if R lived with both parents until age 16 or if R reports having lived with only a father. 
--headofhh_mother =1 if R lived with a mother but not a father. 
--headofhh_othermale =1 if R lived with a male relative and not with R's parents.
--headofhh_otherfemale =1 if R lived with a female relative and not with R's parents or a male relative. 
*/

* Household-head indicators built from v314 and v316 (R's living arrangement while
* growing up; exact code meanings come from the codebook -- TODO confirm).

* --- father ---
gen headofhh_father =.
* v314==5 is deliberately left missing here so that the two v316-based rules below
* can resolve it. Previously this line also set v314==5 to 0, which made both
* `headofhh_father==.` rules unreachable (father-only households were coded 0).
* Consequence: v314==5 with a v316 code outside both lists now stays missing.
replace headofhh_father = (v314==1) if v314<. & v314!=5
replace headofhh_father = 1 if headofhh_father==. & v314==5 & inlist(v316,15,20,25,50) //note: Assumed here that if R lived with adoptive parents, a father was one of the adoptive parents. 
replace headofhh_father = 0 if headofhh_father==. & v314==5 & inlist(v316,10,30,40,60,70,80)
tab headofhh_father,m 

* --- mother: 0 when a father headed the hh, 1 when v314==5 and v316==10 ---
gen headofhh_mother =.
replace headofhh_mother = 0 if headofhh_father==1
replace headofhh_mother = 1 if headofhh_mother==. & v314==5 & v316==10
tab headofhh_mother,m 

* --- other male: 0 when father or mother headed the hh, 1 when v314==5 and v316==30 ---
gen headofhh_othermale =.
replace headofhh_othermale = 0 if headofhh_father==1
replace headofhh_othermale = 0 if headofhh_othermale==. & headofhh_mother==1
replace headofhh_othermale = 1 if headofhh_othermale==. & v314==5 & v316==30
tab headofhh_othermale,m 

* --- other female: 0 when father, mother, or other male headed the hh ---
* NOTE(review): nothing ever sets headofhh_otherfemale to 1, although the coding
* description above says it should be 1 when R lived with a female relative.
* The relevant v316 code is not identifiable from this file -- add it once confirmed.
gen headofhh_otherfemale =.
replace headofhh_otherfemale = 0 if headofhh_father==1
replace headofhh_otherfemale = 0 if headofhh_otherfemale==. & headofhh_mother==1
replace headofhh_otherfemale = 0 if headofhh_otherfemale==. & headofhh_othermale==1
tab headofhh_otherfemale,m 

* Back-fill: mother is 0 wherever another male relative headed the hh.
replace headofhh_mother =0 if headofhh_othermale==1 & headofhh_mother==.
tab headofhh_mother,m 


//note: avtmh57 provides no info about who was head of hh when R was growing up. Assumed that for all obs with non-missing father occ, their hh head was the father.
* NOTE(review): the line below is a plain copy of headofhh_father; no imputation from
* father's occupation is actually performed here, despite the note and the label.
gen headofhh_father_imputed = headofhh_father 
label var headofhh_father_imputed "Impute dad when parent occ!=missing & no info about hh head when R was kid"

*****************************************************************************************************
* DUMMIES FOR WHEN WE KNOW WHY DAD OR MOM DIDN'T WORK (I.E. WHY FATHEROCCEJ/MOTHEROCCEJ IS MISSING) *
*****************************************************************************************************
* father_notworking, for rows where the crosswalked father occupation is missing:
*   missing when the raw occupation code is >= 980 (this comparison is also true
*           when occupation itself is missing, since missing sorts above all numbers),
*   1       when the raw occupation code is 910,
*   0       in every other case (including all rows with a non-missing fatheroccej).
gen father_notworking = cond(fatheroccej==. & occupation>=980, ., fatheroccej==. & occupation==910)
tab father_notworking, m

//mother_ notworking--not available.

* Variable for father being either farm laborer or operator
* (ANES codes 71 and 81); missing when father's occupation is missing.
gen fatherfarm = inlist(fatheroccej, 71, 81) if fatheroccej != .
tab fatherfarm, m

*temporarily rename fatheroccej (for the DAD) so that income scores crosswalk can be merged again with adult child's occ
rename fatheroccej father_occ1950ej

/*------------------------------------------------------------------------------------------

Crosswalking 1957 AVTMH occupations to 1950 ANES occupations for adult children 								

------------------------------------------------------------------------------------------*/
sort v179

* The name "occupation" is already taken by the father's code, so build a temporary
* copy of the crosswalk whose key is called occupation_child instead.
preserve
	use "../Crosswalks/Crosswalk_occ_AVTMH1957_toANES.dta", clear
	rename occupation occupation_child
	tempfile child_xwalk
	save `child_xwalk'
restore 

* R's own occupation is v179; merge on it and discard unmatched crosswalk rows.
rename v179 occupation_child
merge m:1 occupation_child using `child_xwalk', keep(master match) nogenerate

* The crosswalked code is R's occupation; name it occRej to match other data files.
rename occ1950_ej occRej
label variable occRej "Adult child's Occupation (based on ANES categories)"
label variable occupation_child "Adult child's Occupation (AVTMH categories), detailed"

*************************************************************************
* Fix occupations for self-employed businessmen, managers, or officials * 
*************************************************************************
* Self-employed respondents coded 28 are moved to code 21.
replace occRej = 21 if occRej==28 & selfemployed==1 

* Give the father's ANES occupation its final name back.
rename father_occ1950ej fatheroccej 

/*------------------------------------------------------------------------------------------

									Family Income						

------------------------------------------------------------------------------------------*/

*logged family income (value)--family income is not continuous in this survey.

/*
The midpoint of each bin is assigned, with the exception of:
 (1) the last bin, whose bottom value is multiplied by 1.25 (as last bin is always "open-ended"--i.e. "25,000 or more")
 (2) the bottom bin, whose top value is multiplied by 0.75
*/
* Dollar value assigned to each v320 income-bin code. The two lists are parallel.
* 750 is the bottom bin (0.75 * 1000) and 18750 is the open-ended top bin
* (1.25 * 15000); the rest are the values assigned to the interior bins.
gen fam_inc = .
local bin_codes   10  15   20   25   30   35   40   50   60   70    80
local bin_dollars 750 1500 2500 3500 4500 5500 6500 7500 8500 12500 18750
local n_bins : word count `bin_codes'
forvalues b = 1/`n_bins' {
	local code    : word `b' of `bin_codes'
	local dollars : word `b' of `bin_dollars'
	replace fam_inc = `dollars' if v320==`code'
}
label var fam_inc "Binned family income (based on midpoints of each bin)"
tab fam_inc, missing

//note: the suffix "_son" is used to match the variable names in other datasets. All respondents (i.e., male and female) are given a value for these variables. 
* Flags for the bottom and top bins; missing when income itself is missing.
gen bottomcoded_son = (fam_inc==750) if fam_inc<.
gen topcoded_son    = (fam_inc==18750) if fam_inc<.

/* Turn fam_inc into 1950 dollars using the CPI: https://data.bls.gov/timeseries/CUUR0000SA0 */
gen CPI1950 = 24.1
gen CPI1957 = 28.1 

* Deflate 1957 dollars to 1950 dollars.
gen fam_inc_real = fam_inc * (CPI1950/CPI1957)
label var fam_inc_real "Binned Family income, in 1950 dollars"

gen lnfaminc = ln(fam_inc_real)
label var lnfaminc "Logged family income, binned and real"

/*------------------------------------------------------------------------------------------

									Birth cohorts							

------------------------------------------------------------------------------------------*/

* Survey year
gen year = 1957

******************************************************************************************************************
* DOB variable 
******************************************************************************************************************
/*
	Note: The survey reports age only in 5-year bins, not as a continuous value.
		  This section restricts the sample to the bins covering ages 30-49 and
		  turns each bin's value label (e.g. "30-34") into a birth-year range
		  string (e.g. "1923-1927"). A birth year within that range is drawn
		  at random further below, and age is recomputed from it.
*/

keep if inrange(age, 30, 49)

* Value label of the age bin as text, e.g. "30-34"
decode age, generate(strage) maxlength(5) 
tab strage, missing

* Split the label at the dash into numeric lower and upper ages
split strage, parse("-")
destring strage*, replace
rename (strage1 strage2) (alower aupper)

* Obtain dob range: latest birth year from the lower age, earliest from the upper age
gen dob1 = 1957 - alower
tab dob1 , m

gen dob2 = 1957 - aupper
tab dob2 , m

tostring dob1, generate(strdob1)
tostring dob2, generate(strdob2)
gen strarange = strdob2 + "-" + strdob1 
tab strarange, m

**only 4 possible dob ranges: 1908-1912,1913-1917,1918-1922, 1923-1927
* Remove the intermediate helpers; only strarange is needed from here on.
drop strage strd* dob* alower aupper

**************
*creating DOB (with help of strage)
**************

/*note: Sorting by interview number (unique to each respondent) ensures that the number of people per birth year remains the same if run again */

* Fix the row order (v2 is the interview number) and the seed so that the draw
* below is reproducible from run to run.
sort v2
set seed 1234 
gen randnum = runiformint(1,100) 

* Each 5-year range gets one of its five birth years with equal probability:
* draws 1-20 map to the first year of the range, 21-40 to the second, ...,
* 81-100 to the fifth. floor((randnum-1)/20) is that 0..4 offset.
gen dob =.
forvalues first_year = 1908(5)1923 {
	local last_year = `first_year' + 4
	replace dob = `first_year' + floor((randnum - 1)/20) if strarange=="`first_year'-`last_year'"
}

tab dob, missing

* Decade of birth, derived from the randomly assigned birth year.
gen decade =.
replace decade = 1900 if inrange(dob,1900,1909)
replace decade = 1910 if inrange(dob,1910,1919)
replace decade = 1920 if inrange(dob,1920,1929)
tab decade, missing
label var decade "Decade of Birth"

* One indicator per decade present in the data (decade_1, decade_2, ...).
tab decade, gen(decade_)

* Recode age (currently a range--e.g. age ==30 is =30-34) to be survey year - birth year
tab age
tab age, nol
drop age
gen age = year - dob 
tab age, m

* agesq was created earlier from the binned age code, before age was rebuilt above.
* Refresh it so that the saved agesq is the square of the saved age.
replace agesq = age*age


/*------------------------------------------------------------------------------------------

									Interactions			

------------------------------------------------------------------------------------------*/

***************
*** INTERACTIONS 
***************

	global institution_list "black hs_ed coll_ed"

* Demean the variables that we will use.
* For each variable in the list, create <var>_dm = <var> minus its sample mean and
* carry the original variable label over with a ", demeaned" suffix.
	foreach var of global institution_list {
		sum `var'
		* Use r(mean) directly rather than expanding it through a macro, so the
		* mean is used at full double precision instead of as formatted text.
		gen `var'_dm = `var' - r(mean)
		local temp: var label `var' 
		label var `var'_dm "`temp', demeaned" 
	}

/*------------------------------------------------------------------------------------------

										Save			

------------------------------------------------------------------------------------------*/

* Flag observations with no usable family income.
gen faminc_missing = (fam_inc_real == .)
label var faminc_missing "Family income missing"

*create unique id for each obs--in this dataset, the unique identifier is v2
rename v2 id_avtmh57 
label var id_avtmh57 "INTERVIEW NUMBER (unique identifier)"

duplicates report id_avtmh57 //no duplicates reported

* Restrict sample and save
compress
* Lower-case the dataset suffix on every variable that ends in _AVTMH57,
* then put the id and the weight first.
rename *_AVTMH57 *_avtmh57
sort id_avtmh57
order id_avtmh57 weight_avtmh57
save "./output/AVTMH1957_IGEanalysis.dta", replace

